---
title: "FEATURE SELECTION"
author: "Melinda Manczinger"
date: "`r Sys.Date()`"
geometry: "left=2cm,right=2cm,top=2cm,bottom=2cm"
output: pdf_document
urlcolor: blue
---

## METHODS

##### Step 1 - Loading the previously created data frame

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
setwd("~/Desktop/Forest_project")
load("all_points.RData")
```

---

##### Step 2 - Creating dummy variable

We have one categorical variable in the dataset, i.e. ```forest_cover``` which needs to be transformed into a dummy variable.

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
# Using model.matrix() to create dummy variables from all_points$forest_cover, omitting the first level as the baseline
dummies <- as.data.frame(model.matrix(~ forest_cover - 1, data = all_points))

# Putting the two together
all_points <- cbind.data.frame(all_points, dummies)

# Saving the df
# saveRDS(all_points, file = "all_points_with_dummy.rds")
all_points <- readRDS(file = "all_points_with_dummy.rds")

# Removing original forest_cover column
all_points <- all_points[,-20]
```

---

##### Step 3 - Assessing collinearity

We investigate our data set concerning collinearity. Below you can find the correlation matrix for the data set containing the original 25 variables + 1 dummy variable:

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
library(corrplot)
library(ggplot2)

png(filename = "corrplot_all.png", width = 1000, height = 1000, res = 125)
corrplot(cor(all_points[, 7:35], method = "spearman"),
         method = "square",
         type = "lower",
         addCoef.col = T,
         order = "alphabet",
         tl.cex = 0.8,
         tl.col = "black",
         cl.cex = 0.8,
         number.cex = 0.6,
         number.digits = 1)
dev.off()
```

---

##### Step 4 - Treating collinear variables

Due to high collinearity observed mostly in between the climatic variables, we combine the following predictors:

  - ```Tmax_month``` and ```Tmin_month``` into a range variable called ```Trange_month```
  - ```Tmax``` and ```Tmin``` into a range variable called ```Trange_seasonal```

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
all_points$Trange_month <- all_points$Tmax_month-all_points$Tmin_month
all_points$Trange_seasonal <- all_points$Tmax-all_points$Tmin

# saveRDS(all_points, file = "all_points_with_ranges.rds")
all_points <- readRDS("all_points_with_ranges.rds")

# Removing original climatic variables from df
all_points <- all_points[, -c(8,9,12,13)]

# Saving df
# saveRDS(all_points, file = "all_points_without_original_columns.rds")
all_points <- readRDS("all_points_without_original_columns.rds")
```

---

##### Step 5 - Assessing collinearity

Correlation matrix for the modified dataset:

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
png(filename = "corrplot_modified.png", width = 1000, height = 1000, res = 125)
corrplot(cor(all_points[, c(7:33)], method = "spearman"),
         method = "square",
         type = "lower",
         addCoef.col = T,
         order = "alphabet",
         tl.cex = 0.8,
         tl.col = "black",
         cl.cex = 0.8,
         number.cex = 0.6,
         number.digits = 1)
dev.off()
```

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
# Saving the data frame
# saveRDS(object = all_points, file = "all_points_finalized.rds")
all_points <- readRDS(file = "all_points_finalized.rds")
```

---

##### Step 6 - Changing response variable's position in the dataset (=starting with 0 instead of 1)

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
all_points <- readRDS(file = "all_points_finalized.rds")
# Changing factor to numeric
all_points$fire_points <- as.numeric(all_points$fire_points)
# Assigning 0 to 2
all_points$fire_points[all_points$fire_points == 2] <- 0
# Changing order
all_points <- all_points[order(all_points$fire_points),]
# Checking
table(all_points$fire_points)

# saveRDS(object = all_points, file = "all_points_finalized.rds")
all_points <- readRDS(file = "all_points_finalized.rds")
```

---

##### Step 7 - Splitting the dataset

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
# 80% training & 20% test set separated
fire1 <- all_points[all_points$fire_points == 1,]
fire0 <- all_points[all_points$fire_points == 0,]
set.seed(2024)
train1 <- sample(1:nrow(fire1), nrow(fire1)[1]*0.8)
train0 <- sample(1:nrow(fire0), nrow(fire0)[1]*0.8)
training <- rbind.data.frame(fire1[train1,], fire0[train0,]) # this will be further divided into 80-20
test <- rbind.data.frame(fire1[-train1,], fire0[-train0,]) # this will not be used until H2O implementation to ensure independence of data
```

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
set.seed(2025)
fire2 <- training[training$fire_points == 1,]
fire3 <- training[training$fire_points == 0,]

train_fs1 <- sample(1:nrow(fire2), nrow(fire2)[1]*0.8)
train_fs2 <- sample(1:nrow(fire3), nrow(fire3)[1]*0.8)
training_fs <- rbind.data.frame(fire2[train_fs1,], fire3[train_fs2,]) # this will be used for feature selection training
test_fs <- rbind.data.frame(fire2[-train_fs1,], fire3[-train_fs2,]) # this will be used for feature selection model evaluation
```

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
test <- test[order(test$fire_points),]
# saveRDS(object = test, file = "test.rds")

test_fs <- test_fs[order(test_fs$fire_points),]
# saveRDS(object = test_fs, file = "test_fs.rds")

training <- training[order(training$fire_points),]
# saveRDS(object = training, file = "training.rds")

training_fs <- training_fs[order(training_fs$fire_points),]
# saveRDS(object = training_fs, file = "training_fs.rds")

test <- readRDS(file = "test.rds")
test_fs <- readRDS(file = "test_fs.rds")
training <- readRDS(file = "training.rds")
training_fs <- readRDS(file = "training_fs.rds")
```

---

##### Step 8 - Scaling

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
# Copying the original sets into scaled data frames
scaled_test <- test
scaled_test_fs <- test_fs
scaled_training <- training
scaled_training_fs <- training_fs

# Scaling only explanatory (numeric) variables
scaled_test[, c(7:33)] <- scale(x = scaled_test[, c(7:33)])
scaled_test_fs[, c(7:33)] <- scale(x = scaled_test_fs[, c(7:33)])
scaled_training[, c(7:33)] <- scale(x = scaled_training[, c(7:33)])
scaled_training_fs[, c(7:33)] <- scale(x = scaled_training_fs[, c(7:33)])

# Saving the data
# saveRDS(object = scaled_test, file = "scaled_test.rds")
# saveRDS(object = scaled_test_fs, file = "scaled_test_fs.rds")
# saveRDS(object = scaled_training, file = "scaled_training.rds")
# saveRDS(object = scaled_training_fs, file = "scaled_training_fs.rds")

scaled_test <- readRDS(file = "scaled_test.rds")
scaled_test_fs <- readRDS(file = "scaled_test_fs.rds")
scaled_training <- readRDS(file = "scaled_training.rds")
scaled_training_fs <- readRDS(file = "scaled_training_fs.rds")
```

---

##### Step 9 - RFE

---

###### Step 9.1 - RFE "All in" set with rerank FALSE

- "All in" set with rerank FALSE: running RFE with all predictors and rerank argument FALSE. Additional ```pickSizeTolerance``` argument is also used to find a potentially more optimal and smaller subset.

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
library(caret)

# Setting the rfeControl function before running RFE
ctrl_all_f <- rfeControl(functions = rfFuncs, # using Random Forest variable importance score metrics
                         rerank = F, # variable importance is recalculated each time a predictor is removed
                         method = "cv", # external cross-validation
                         repeats = 5, # 5 times repeated 10-fold cv
                         number = 10) # 10-fold cv

# Creating a vector of training set outcomes (factor)
y_factor <- as.factor(training_fs[,6])

# Running the algorithm
startTime <- Sys.time()
set.seed(2000)
rfe_all_f <- rfe(x = scaled_training_fs[,-c(1:6)],
                 y = y_factor,
                 sizes = c(1:27),
                 rfeControl = ctrl_all_f)
endTime <- Sys.time()
rfe_all_f_runtime <- endTime-startTime # ~ 45 min
# saveRDS(rfe_all_f_runtime, file = "rfe_all_f_runtime.rds")

# Showing the results
print(rfe_all_f)
# Retained 16 predictors
predictors(rfe_all_f)

# Saving results
# saveRDS(rfe_all_f, file = "rfe_all_f.rds")
rfe_all_f <- readRDS("rfe_all_f.rds")

# Evaluation on test set
set.seed(2222)
postResample(predict(rfe_all_f, scaled_test_fs[,-c(1:5,8,12,13,18,19,20,21,22,29,31,33)]), as.factor(test_fs[,6]))
# Accuracy = 0.8520531
# Kappa = 0.7041063

# Additional step to minimize the negative effect of multicollinearity
# Setting the helper function ```pickSizeTolerance``` to identify potentially smaller subset size within 1% tolerance
plot(rfe_all_f)

accuracies_all_f <- data.frame(Accuracy = rfe_all_f$results$Accuracy,
                               Variables = 1:27)

within1pct <- pickSizeTolerance(x = accuracies_all_f, metric = "Accuracy", tol = 1, maximize = T)
within1pct
# Subset of 10 variables identified

# ----- Rerunning the algorithm for accepting my "worse" model

# Due to receiving error message with <within1pct>, I write a function for the above:
myPickSizeTolerance <- function(x, metric = "Accuracy", tol = 1, maximize = T) {
  return(caret::pickSizeTolerance(x, metric = "Accuracy", tol = 1, maximize = T))
}

myFuncs <- rfFuncs
myFuncs$selectSize <- myPickSizeTolerance

ctrl_all_f_1 <- rfeControl(functions = myFuncs,
                           rerank = F,
                           method = "cv",
                           repeats = 5,
                           number = 10)

# Running the algorithm
startTime1 <- Sys.time()
set.seed(2001)
rfe_all_f_1 <- rfe(x = scaled_training_fs[,-c(1:6)],
                   y = y_factor,
                   sizes = c(1:27),
                   rfeControl = ctrl_all_f_1)
endTime1 <- Sys.time()
rfe_all_f_runtime_1 <- endTime1-startTime1 # ~46 min
# saveRDS(rfe_all_f_runtime_1, file = "rfe_all_f_runtime_1.rds")

# Showing the results
print(rfe_all_f_1)
# Retained 10 predictors
predictors(rfe_all_f_1)

# Saving results
# saveRDS(rfe_all_f_1, file = "rfe_all_f_1.rds")
rfe_all_f_1 <- readRDS("rfe_all_f_1.rds")

# Variables and their importance scores
# varimp_data <- data.frame(feature = row.names(varImp(rfe_all_f_1))[1:10],
#                           importance = varImp(rfe_all_f)[1:10, 1])

# Evaluation on test set
set.seed(2002)
postResample(predict(rfe_all_f_1, scaled_test_fs[,c(7,14,15,16,17,23,24,25,26,30)]), as.factor(test_fs[,6]))
# Accuracy = 0.8399758
# Kappa = 0.6799517

# Checking other evaluation metrics for the test set
library(pROC)
# Saving predictions in model_pred object
model_pred <- predict(object = rfe_all_f_1, newdata = scaled_test_fs[,c(7,14,15,16,17,23,24,25,26,30)], type = "response")
# Saving actual classes in a separate column of model_pred
model_pred$actual <- as.factor(scaled_test_fs$fire_points)
# Copying pred and actual in a separate columns
model_pred$pred2 <- model_pred$pred
model_pred$actual2 <- model_pred$actual
# Renaming the doubled columns
levels(model_pred$actual2) <- c("non-fire", "fire")
levels(model_pred$pred2) <- c("non-fire", "fire")
# Saving the real y values and the predicted values in a roc object
colnames(model_pred) <- c("pred", "neg_pred", "pos_pred", "actual", "pred2", "actual2")
data_roc <- roc(as.numeric(model_pred$actual), as.numeric(model_pred$pos_pred))
data_roc$auc # AUC = 0.9231
# Confusion matrix (simple)
table(model_pred$actual, model_pred$pred)
# Creating confusion matrix (with caret package)
library(caret)
confusionMatrix(data = model_pred$pred2,
                reference = model_pred$actual2,
                mode = "everything",
                positive = "fire")
```

---

###### Step 9.2 - RFE "All in" set with rerank TRUE

- "All in" set with rerank TRUE: running RFE with all predictors and rerank argument TRUE. Additional ```pickSizeTolerance``` argument is also used to find a potentially more optimal and smaller subset.

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
# Running RFE with all predictors and rerank argument TRUE

# Debugging of rfFuncs: https://github.com/topepo/caret/issues/1271
# Error in { : task 1 failed - "dim(X) must have a positive length"
rfFuncs$rank <- function (object, x, y) 
{
  vimp <- varImp(object)
  if (is.factor(y)) {
    if (all(levels(y) %in% colnames(vimp))) {
      avImp <- apply(vimp[, levels(y), drop = FALSE], 1, # changed to drop = FALSE
                     mean)
      vimp$Overall <- avImp
    }
  }
  vimp <- vimp[order(vimp$Overall, decreasing = TRUE), , drop = FALSE]
  if (ncol(x) == 1) {
    vimp$var <- colnames(x)
  }
  else vimp$var <- rownames(vimp)
  vimp
}

ctrl_all_t <- rfeControl(functions = rfFuncs,
                         rerank = T,
                         method = "cv",
                         repeats = 5,
                         number = 10)

y_factor <- as.factor(training_fs[,6])

# Running the algorithm
startTime2 <- Sys.time()
set.seed(199)
rfe_all_t <- rfe(x = scaled_training_fs[,-c(1:6)],
                 y = y_factor,
                 sizes = c(1:27),
                 rfeControl = ctrl_all_t)
endTime2 <- Sys.time()
rfe_all_t_runtime <- endTime2-startTime2 # 46 min
# saveRDS(rfe_all_t_runtime, file = "rfe_all_t_runtime.rds")

# Showing the results
print(rfe_all_t)
# Retained 19 predictors
predictors(rfe_all_t)

# Saving results
# saveRDS(rfe_all_t, file = "rfe_all_t.rds")
rfe_all_t <- readRDS("rfe_all_t.rds")

# Evaluation on test set
set.seed(334)
postResample(predict(rfe_all_t, scaled_test_fs[,-c(1:5,12,13,18:21,29,31)]), as.factor(test_fs[,6]))
# Accuracy = 0.8532609
# Kappa = 0.7065217

# Additional step to minimize the negative effect of multicollinearity
# Setting the helper function <pickSizeTolerance> to identify potentially smaller subset size within 1% tolerance
plot(rfe_all_t)

accuracies_all_t <- data.frame(Accuracy = rfe_all_t$results$Accuracy,
                               Variables = 1:27)

within1pct_2 <- pickSizeTolerance(x = accuracies_all_t, metric = "Accuracy", tol = 1, maximize = T)
within1pct_2
# Subset of 9 variables identified

# ----- Rerunning the algorithm for accepting my "worse" model

myPickSizeTolerance2 <- function(x, metric = "Accuracy", tol = 1, maximize = T) {
  return(caret::pickSizeTolerance(x, metric = "Accuracy", tol = 1, maximize = T))
}

myFuncs2 <- rfFuncs
myFuncs2$selectSize <- myPickSizeTolerance2

ctrl_all_t_1 <- rfeControl(functions = myFuncs2,
                           rerank = T,
                           method = "cv",
                           repeats = 5,
                           number = 10)

# Running the algorithm
startTime3 <- Sys.time()
set.seed(20002)
rfe_all_t_1 <- rfe(x = scaled_training_fs[,-c(1:6)],
                   y = y_factor,
                   sizes = c(1:27),
                   rfeControl = ctrl_all_t_1)
endTime3 <- Sys.time()
rfe_all_t_runtime_1 <- endTime3-startTime3 # 44 min
# saveRDS(rfe_all_t_runtime_1, file = "rfe_all_t_runtime_1.rds")

# Showing the results
print(rfe_all_t_1)
# Retained 9 predictors
predictors(rfe_all_t_1)

# Saving results
# saveRDS(rfe_all_t_1, file = "rfe_all_t_1.rds")
rfe_all_t_1 <- readRDS("rfe_all_t_1.rds")

# Variables and their importance scores: identified 9 predictors, so I include all of the 9 in the final output
# varimp_data1 <- data.frame(feature = row.names(varImp(rfe_all_t_1))[1:9],
#                           importance = varImp(rfe_all_t_1)[1:9, 1])

# Evaluation on test set
set.seed(2007)
postResample(predict(rfe_all_t_1, scaled_test_fs[,c(7,14,15,16,17,23,24,25,30)]), as.factor(test_fs[,6]))
# Accuracy = 0.8363527
# Kappa = 0.6727053

# Checking other evaluation metrics for the test set
library(pROC)
# Saving predictions in model_pred2 object
model_pred2 <- predict(object = rfe_all_t_1, newdata = scaled_test_fs[,c(7,14,15,16,17,23,24,25,30)], type = "response")
# Saving actual classes in a separate column of model_pred2
model_pred2$actual <- as.factor(scaled_test_fs$fire_points)
# Copying pred and actual in a separate columns
model_pred2$pred2 <- model_pred2$pred
model_pred2$actual2 <- model_pred2$actual
# Renaming the doubled columns
levels(model_pred2$actual2) <- c("non-fire", "fire")
levels(model_pred2$pred2) <- c("non-fire", "fire")
# Saving the real y values and the predicted values in a roc object
colnames(model_pred2) <- c("pred", "neg_pred", "pos_pred", "actual", "pred2", "actual2")
data_roc2 <- roc(as.numeric(model_pred2$actual), as.numeric(model_pred2$pos_pred))
data_roc2$auc # AUC = 0.9159
# Confusion matrix (simple)
table(model_pred2$actual2, model_pred2$pred2)
# Creating confusion matrix (with caret package)
library(caret)
confusionMatrix(data = model_pred2$pred2,
                reference = model_pred2$actual2,
                mode = "everything",
                positive = "fire")
```

---

##### Step 10 - VIF

Variance Inflation Factor (VIF) values will be calculated in logistic regression models to determine the uncorrelated predictor set (also used in RFE in the next section). First, "permissive" VIF predictor set is identified with values less than 10, then, "restrictive" VIF with values less than 5.

###### Step 10.1 - "Permissive" VIF model

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
library(car) # for VIF calculation
library(pROC) # for ROC-AUC calculation

# Model 0 - "all in" model
set.seed(100)
startTime4 <- Sys.time()
model1 <- glm(formula = fire_points ~ .,
              family = binomial(link = "logit"),
              data = scaled_training_fs[,-c(1:5,31)]) # excluding forest_cover4 (dummy variable) as ref
endTime4 <- Sys.time()
vif_model1_runtime <- endTime4-startTime4 # < 1 min
# saveRDS(vif_model1_runtime, file = "vif_model1_runtime.rds")

summary(model1)

model1_test <- predict(object = model1, newdata = scaled_test_fs)
roc1 <- roc(scaled_test_fs$fire_points, model1_test)
roc1$auc
# Area under the curve: 0.8589

VIF <- as.data.frame(round(vif(model1),3))
VIF$Features <- rownames(VIF)
rownames(VIF) <- nrow(1:nrow(VIF))
colnames(VIF) <- c("VIF", "Features")
VIF <- VIF[, c(2,1)]
# saveRDS(object = VIF, file = "VIF.rds")
VIF <- readRDS(file = "VIF.rds")
```

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
# Model 1
set.seed(101)
startTime5 <- Sys.time()
model2 <- glm(formula = fire_points ~ .,
              family = binomial(link = "logit"),
              data = scaled_training_fs[,-c(1:5,9,31)]) # excluding Tave
endTime5 <- Sys.time()
vif_model2_runtime <- endTime5-startTime5 # < 1 min
# saveRDS(vif_model2_runtime, file = "vif_model2_runtime.rds")

summary(model2)

model2_test <- predict(object = model2, newdata = scaled_test_fs)
roc2 <- roc(scaled_test_fs$fire_points, model2_test)
roc2$auc
# Area under the curve: 0.8592

VIF2 <- as.data.frame(round(vif(model2),3))
VIF2$Features <- rownames(VIF2)
rownames(VIF2) <- nrow(1:nrow(VIF2))
colnames(VIF2) <- c("VIF", "Features")
VIF2 <- VIF2[, c(2,1)]
# saveRDS(object = VIF2, file = "VIF2.rds")
VIF2 <- readRDS(file = "VIF2.rds")
```

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
# Model 2
set.seed(102)
startTime6 <- Sys.time()
model3 <- glm(formula = fire_points ~ .,
              family = binomial(link = "logit"),
              data = scaled_training_fs[,-c(1:5,9,13,31)]) # excluding Tave & AHM
endTime6 <- Sys.time()
vif_model3_runtime <- endTime6-startTime6 # < 1 min
# saveRDS(vif_model3_runtime, file = "vif_model3_runtime.rds")

summary(model3)

model3_test <- predict(object = model3, newdata = scaled_test_fs)
roc3 <- roc(scaled_test_fs$fire_points, model3_test)
roc3$auc
# Area under the curve: 0.8587

VIF3 <- as.data.frame(round(vif(model3),3))
VIF3$Features <- rownames(VIF3)
rownames(VIF3) <- nrow(1:nrow(VIF3))
colnames(VIF3) <- c("VIF", "Features")
VIF3 <- VIF3[, c(2,1)]
# saveRDS(object = VIF3, file = "VIF3.rds")
VIF3 <- readRDS(file = "VIF3.rds")
```

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
# Model 3
set.seed(103)
startTime7 <- Sys.time()
model4 <- glm(formula = fire_points ~ .,
              family = binomial(link = "logit"),
              data = scaled_training_fs[,-c(1:5,9,13,30:31)]) # excluding Tave & AHM & forest_cover3
endTime7 <- Sys.time()
vif_model4_runtime <- endTime7-startTime7 # < 1 min
# saveRDS(vif_model4_runtime, file = "vif_model4_runtime.rds")

summary(model4)

model4_test <- predict(object = model4, newdata = scaled_test_fs)
roc4 <- roc(scaled_test_fs$fire_points, model4_test)
roc4$auc
# Area under the curve: 0.8392

VIF4 <- as.data.frame(round(vif(model4),3))
VIF4$Features <- rownames(VIF4)
rownames(VIF4) <- nrow(1:nrow(VIF4))
colnames(VIF4) <- c("VIF", "Features")
VIF4 <- VIF4[, c(2,1)]
# saveRDS(object = VIF4, file = "VIF4.rds")
VIF4 <- readRDS(file = "VIF4.rds")

# Checking other evaluation metrics for the test set
model4_pred <- as.data.frame(predict(object = model4, newdata = scaled_test_fs, type = "response"))
model4_pred$pred_class <- ifelse(model4_pred$`predict(object = model4, newdata = scaled_test_fs, type = "response")`>= 0.5,1,0)
model4_pred$actual <- scaled_test_fs$fire_points
colnames(model4_pred) <- c("neg_pred", "pred_class", "actual")
# Copying pred and actual in a separate columns
model4_pred$pred2 <- as.factor(model4_pred$pred_class)
model4_pred$actual2 <- as.factor(model4_pred$actual)
# Renaming the doubled columns
levels(model4_pred$actual2) <- c("non-fire", "fire")
levels(model4_pred$pred2) <- c("non-fire", "fire")
# Confusion matrix (simple)
table(model4_pred$actual2, model4_pred$pred2)
# Creating confusion matrix (with caret package)
library(caret)
confusionMatrix(data = model4_pred$pred2,
                reference = model4_pred$actual2,
                mode = "everything",
                positive = "fire")
```

---

###### Step 10.2 - Assessing collinearity

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
library(corrplot)
png(filename = "corrplot_VIF.png", width = 1000, height = 1000, res = 125)
corrplot(cor(scaled_training_fs[,-c(1:6,9,13,30:31)], method = "spearman"),
         method = "square",
         type = "lower",
         addCoef.col = T,
         order = "alphabet",
         tl.cex = 0.8,
         tl.col = "black",
         cl.cex = 0.8,
         number.cex = 0.6,
         number.digits = 1)
dev.off()

correlations <- cor(scaled_training_fs[,-c(1:6,9,13,30:31)], method = "spearman")
min(correlations) # -0.7539773
max(correlations[correlations != diag(correlations)]) # 0.9160942
```

---

###### Step 10.3 - "Restrictive" VIF model

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
# Model 4
set.seed(104)
startTime8 <- Sys.time()
model5 <- glm(formula = fire_points ~ .,
              family = binomial(link = "logit"),
              data = scaled_training_fs[,-c(1:5,9,13,16,30:31)]) # excluding also elevation
endTime8 <- Sys.time()
vif_model5_runtime <- endTime8-startTime8 # < 1 min
# saveRDS(vif_model5_runtime, file = "vif_model5_runtime.rds")

summary(model5)

model5_test <- predict(object = model5, newdata = scaled_test_fs)
roc5 <- roc(scaled_test_fs$fire_points, model5_test)
roc5$auc
# Area under the curve: 0.8387

VIF5 <- as.data.frame(round(vif(model5),3))
VIF5$Features <- rownames(VIF5)
rownames(VIF5) <- nrow(1:nrow(VIF5))
colnames(VIF5) <- c("VIF", "Features")
VIF5 <- VIF5[, c(2,1)]
# saveRDS(object = VIF5, file = "VIF5.rds")
VIF5 <- readRDS(file = "VIF5.rds")
```

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
# Model 5
set.seed(105)
startTime9 <- Sys.time()
model6 <- glm(formula = fire_points ~ .,
              family = binomial(link = "logit"),
              data = scaled_training_fs[,-c(1:5,9,13,16,30:32)]) # excluding also Trange_month
endTime9 <- Sys.time()
vif_model6_runtime <- endTime9-startTime9 # < 1 min
# saveRDS(vif_model6_runtime, file = "vif_model6_runtime.rds")

summary(model6)

model6_test <- predict(object = model6, newdata = scaled_test_fs)
roc6 <- roc(scaled_test_fs$fire_points, model6_test)
roc6$auc
# Area under the curve: 0.8385

VIF6 <- as.data.frame(round(vif(model6),3))
VIF6$Features <- rownames(VIF6)
rownames(VIF6) <- nrow(1:nrow(VIF6))
colnames(VIF6) <- c("VIF", "Features")
VIF6 <- VIF6[, c(2,1)]
# saveRDS(object = VIF6, file = "VIF6.rds")
VIF6 <- readRDS(file = "VIF6.rds")
```

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
# Model 6
set.seed(106)
startTime10 <- Sys.time()
model7 <- glm(formula = fire_points ~ .,
              family = binomial(link = "logit"),
              data = scaled_training_fs[,-c(1:5,9,13:14,16,30:32)]) # excluding also NDVI_T
endTime10 <- Sys.time()
vif_model7_runtime <- endTime10-startTime10 # < 1 min
# saveRDS(vif_model7_runtime, file = "vif_model7_runtime.rds")

summary(model7)

model7_test <- predict(object = model7, newdata = scaled_test_fs)
roc7 <- roc(scaled_test_fs$fire_points, model7_test)
roc7$auc
# Area under the curve: 0.8396

VIF7 <- as.data.frame(round(vif(model7),3))
VIF7$Features <- rownames(VIF7)
rownames(VIF7) <- nrow(1:nrow(VIF7))
colnames(VIF7) <- c("VIF", "Features")
VIF7 <- VIF7[, c(2,1)]
# saveRDS(object = VIF7, file = "VIF7.rds")
VIF7 <- readRDS(file = "VIF7.rds")

# Checking other evaluation metrics for the test set
model7_pred <- as.data.frame(predict(object = model7, newdata = scaled_test_fs, type = "response"))
model7_pred$pred_class <- ifelse(model7_pred$`predict(object = model7, newdata = scaled_test_fs, type = "response")`>= 0.5,1,0)
model7_pred$actual <- scaled_test_fs$fire_points
colnames(model7_pred) <- c("neg_pred", "pred_class", "actual")
# Copying pred and actual in a separate columns
model7_pred$pred2 <- as.factor(model7_pred$pred_class)
model7_pred$actual2 <- as.factor(model7_pred$actual)
# Renaming the doubled columns
levels(model7_pred$actual2) <- c("non-fire", "fire")
levels(model7_pred$pred2) <- c("non-fire", "fire")
# Confusion matrix (simple)
table(model7_pred$actual2, model7_pred$pred2)
# Creating confusion matrix (with caret package)
library(caret)
confusionMatrix(data = model7_pred$pred2,
                reference = model7_pred$actual2,
                mode = "everything",
                positive = "fire")
```

---

###### Step 10.4 - Assessing collinearity

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
library(corrplot)
png(filename = "corrplot_VIF_R.png", width = 1000, height = 1000, res = 125)
corrplot(cor(scaled_training_fs[,-c(1:6,9,13:14,16,30:32)], method = "spearman"),
         method = "square",
         type = "lower",
         addCoef.col = T,
         order = "alphabet",
         tl.cex = 0.8,
         tl.col = "black",
         cl.cex = 0.8,
         number.cex = 0.6,
         number.digits = 1)
dev.off()

correlations_r <- cor(scaled_training_fs[,-c(1:6,9,13:14,16,30:32)], method = "spearman")
min(correlations_r) # -0.7085999
max(correlations_r[correlations_r != diag(correlations_r)]) # 0.6043098
```

---

##### Step 11 - RFE with VIF < 10

---

###### Step 11.1 - RFE Uncorrelated predictor set with rerank FALSE

The obtained VIF results will be used in itself and in conjunction with RFE with rerank FALSE (running RFE with 23 predictors and rerank argument FALSE).

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
library(caret)

ctrl_vif_f <- rfeControl(functions = rfFuncs,
                         rerank = F,
                         method = "cv",
                         repeats = 5,
                         number = 10)

y_factor <- as.factor(training_fs[,6])

startTime8 <- Sys.time()
set.seed(3001)
rfe_vif_f <- rfe(x = scaled_training_fs[,-c(1:6,9,13,30:31)],
                 y = y_factor,
                 sizes = c(1:23),
                 rfeControl = ctrl_vif_f)
endTime8 <- Sys.time()
rfe_vif_f_runtime <- endTime8-startTime8 # ~34 mins
# saveRDS(rfe_vif_f_runtime, file = "rfe_vif_f_runtime.rds")

print(rfe_vif_f)

predictors(rfe_vif_f)

# Saving results
# saveRDS(rfe_vif_f, file = "rfe_vif_f.rds")
rfe_vif_f <- readRDS("rfe_vif_f.rds")

# Variables and their importance scores
# varimp_data <- data.frame(feature = row.names(varImp(rfe_vif_f))[1:19],
#                           importance = varImp(rfe_vif_f)[1:19, 1])

# Evaluation on test set
set.seed(3004)
postResample(predict(rfe_vif_f, scaled_test_fs[,-c(1:6,9,12:13,18:19,29:31)]), as.factor(test_fs[,6]))
# Accuracy = 0.8599034
# Kappa = 0.7198068

# Checking other evaluation metrics for the test set
library(pROC)
# Saving predictions in model_pred3 object
model_pred3 <- predict(object = rfe_vif_f, newdata = scaled_test_fs[,-c(1:6,9,12:13,18:19,29:31)], type = "response")
# Saving actual classes in a separate column of model_pred3
model_pred3$actual <- as.factor(scaled_test_fs$fire_points)
# Copying pred and actual in a separate columns
model_pred3$pred2 <- model_pred3$pred
model_pred3$actual2 <- model_pred3$actual
# Renaming the doubled columns
levels(model_pred3$actual2) <- c("non-fire", "fire")
levels(model_pred3$pred2) <- c("non-fire", "fire")
# Saving the real y values and the predicted values in a roc object
colnames(model_pred3) <- c("pred", "neg_pred", "pos_pred", "actual", "pred2", "actual2")
data_roc3 <- roc(as.numeric(model_pred3$actual), as.numeric(model_pred3$pos_pred))
data_roc3$auc # AUC = 0.9314
# Confusion matrix (simple)
table(model_pred3$actual2, model_pred3$pred2)
# Creating confusion matrix (with caret package)
library(caret)
confusionMatrix(data = model_pred3$pred2,
                reference = model_pred3$actual2,
                mode = "everything",
                positive = "fire")
```

---

###### Step 11.2 - RFE Uncorrelated predictor set with rerank TRUE

The obtained VIF results will be used in RFE with rerank TRUE: running RFE with 23 predictors and rerank argument TRUE.

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
# Debugging of rfFuncs: https://github.com/topepo/caret/issues/1271
# Error in { : task 1 failed - "dim(X) must have a positive length"
rfFuncs$rank <- function (object, x, y) 
{
  vimp <- varImp(object)
  if (is.factor(y)) {
    if (all(levels(y) %in% colnames(vimp))) {
      avImp <- apply(vimp[, levels(y), drop = FALSE], 1, # changed to drop = FALSE
                     mean)
      vimp$Overall <- avImp
    }
  }
  vimp <- vimp[order(vimp$Overall, decreasing = TRUE), , drop = FALSE]
  if (ncol(x) == 1) {
    vimp$var <- colnames(x)
  }
  else vimp$var <- rownames(vimp)
  vimp
}

ctrl_vif_t <- rfeControl(functions = rfFuncs,
                         rerank = T,
                         method = "cv",
                         repeats = 5,
                         number = 10)

y_factor <- as.factor(training_fs[,6])

startTime9 <- Sys.time()
set.seed(3005)
rfe_vif_t <- rfe(x = scaled_training_fs[,-c(1:6,9,13,30:31)],
                 y = y_factor,
                 sizes = c(1:23),
                 rfeControl = ctrl_vif_t)
endTime9 <- Sys.time()
rfe_vif_t_runtime <- endTime9-startTime9 # ~35 mins
# saveRDS(rfe_vif_t_runtime, file = "rfe_vif_t_runtime.rds")

print(rfe_vif_t)

predictors(rfe_vif_t)

# Saving results
# saveRDS(rfe_vif_t, file = "rfe_vif_t.rds")
rfe_vif_t <- readRDS("rfe_vif_t.rds")

# Variables and their importance scores
# varimp_data <- data.frame(feature = row.names(varImp(rfe_vif_t))[1:21],
#                           importance = varImp(rfe_vif_t)[1:21, 1])

# Evaluation on test set
set.seed(2008)
postResample(predict(rfe_vif_t, scaled_test_fs[,-c(1:6,9,13,18:19,30:31)]), as.factor(test_fs[,6]))
# Accuracy = 0.8550725
# Kappa = 0.7101449

# Checking other evaluation metrics for the test set
library(pROC)
# Saving predictions in model_pred4 object
model_pred4 <- predict(object = rfe_vif_t, newdata = scaled_test_fs[,-c(1:6,9,13,18:19,30:31)], type = "response")
# Saving actual classes in a separate column of model_pred4
model_pred4$actual <- as.factor(scaled_test_fs$fire_points)
# Copying pred and actual in a separate columns
model_pred4$pred2 <- model_pred4$pred
model_pred4$actual2 <- model_pred4$actual
# Renaming the doubled columns
levels(model_pred4$actual2) <- c("non-fire", "fire")
levels(model_pred4$pred2) <- c("non-fire", "fire")
# Saving the real y values and the predicted values in a roc object
colnames(model_pred4) <- c("pred", "neg_pred", "pos_pred", "actual", "pred2", "actual2")
data_roc4 <- roc(as.numeric(model_pred4$actual), as.numeric(model_pred4$pos_pred))
data_roc4$auc # AUC = 0.933
# Confusion matrix (simple)
table(model_pred4$actual2, model_pred4$pred2)
# Creating confusion matrix (with caret package)
library(caret)
confusionMatrix(data = model_pred4$pred2,
                reference = model_pred4$actual2,
                mode = "everything",
                positive = "fire")
```

---

### Step 12 - RFE with VIF < 5

###### Step 12.1 - RFE Uncorrelated predictor set with rerank FALSE

The obtained VIF results will be used in itself and in conjunction with RFE with rerank FALSE (running RFE with 20 predictors and rerank argument FALSE).

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
library(caret)

ctrl_vif_5_f <- rfeControl(functions = rfFuncs,
                         rerank = F,
                         method = "cv",
                         repeats = 5,
                         number = 10)

y_factor <- as.factor(training_fs[,6])

startTime11 <- Sys.time()
set.seed(13)
rfe_vif_5_f <- rfe(x = scaled_training_fs[,VIF7$Features],
                 y = y_factor,
                 sizes = c(1:20),
                 rfeControl = ctrl_vif_5_f)
endTime11 <- Sys.time()
rfe_vif_5_f_runtime <- endTime11-startTime11 # ~27 mins
# saveRDS(rfe_vif_5_f_runtime, file = "rfe_vif_5_f_runtime.rds")

print(rfe_vif_5_f)

predictors(rfe_vif_5_f)

# Saving results
# saveRDS(rfe_vif_5_f, file = "rfe_vif_5_f.rds")
rfe_vif_5_f <- readRDS("rfe_vif_5_f.rds")

# Variables and their importance scores
# varimp_data <- data.frame(feature = row.names(varImp(rfe_vif_5_f))[1:17],
#                           importance = varImp(rfe_vif_5_f)[1:17, 1])

# Evaluation on test set
set.seed(15)
postResample(predict(rfe_vif_5_f, scaled_test_fs[,VIF7$Features]), as.factor(test_fs[,6]))
# Accuracy = 0.8659420
# Kappa = 0.7318841

# Checking other evaluation metrics for the test set
library(pROC)
# Saving predictions in model_pred5 object
model_pred5 <- predict(object = rfe_vif_5_f, newdata = scaled_test_fs[,VIF7$Features], type = "response")
# Saving actual classes in a separate column of model_pred5
model_pred5$actual <- as.factor(scaled_test_fs$fire_points)
# Copying pred and actual in a separate columns
model_pred5$pred2 <- model_pred5$pred
model_pred5$actual2 <- model_pred5$actual
# Renaming the doubled columns
levels(model_pred5$actual2) <- c("non-fire", "fire")
levels(model_pred5$pred2) <- c("non-fire", "fire")
# Saving the real y values and the predicted values in a roc object
colnames(model_pred5) <- c("pred", "neg_pred", "pos_pred", "actual", "pred2", "actual2")
data_roc5 <- roc(as.numeric(model_pred5$actual), as.numeric(model_pred5$pos_pred))
data_roc5$auc # AUC = 0.9348
# Confusion matrix (simple)
table(model_pred5$actual2, model_pred5$pred2)
# Creating confusion matrix (with caret package)
library(caret)
confusionMatrix(data = model_pred5$pred2,
                reference = model_pred5$actual2,
                mode = "everything",
                positive = "fire")
```

---

###### Step 12.2 - RFE Uncorrelated predictor set with rerank TRUE

The obtained VIF results will be used in RFE with rerank TRUE: running RFE with 20 predictors and rerank argument TRUE.

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
# Debugging of rfFuncs: https://github.com/topepo/caret/issues/1271
# Error in { : task 1 failed - "dim(X) must have a positive length"
rfFuncs$rank <- function (object, x, y) 
{
  vimp <- varImp(object)
  if (is.factor(y)) {
    if (all(levels(y) %in% colnames(vimp))) {
      avImp <- apply(vimp[, levels(y), drop = FALSE], 1, # changed to drop = FALSE
                     mean)
      vimp$Overall <- avImp
    }
  }
  vimp <- vimp[order(vimp$Overall, decreasing = TRUE), , drop = FALSE]
  if (ncol(x) == 1) {
    vimp$var <- colnames(x)
  }
  else vimp$var <- rownames(vimp)
  vimp
}

ctrl_vif_5_t <- rfeControl(functions = rfFuncs,
                         rerank = T,
                         method = "cv",
                         repeats = 5,
                         number = 10)

y_factor <- as.factor(training_fs[,6])

startTime12 <- Sys.time()
set.seed(15)
rfe_vif_5_t <- rfe(x = scaled_training_fs[,VIF7$Features],
                 y = y_factor,
                 sizes = c(1:20),
                 rfeControl = ctrl_vif_5_t)
endTime12 <- Sys.time()
rfe_vif_5_t_runtime <- endTime12-startTime12 # ~26 mins
# saveRDS(rfe_vif_5_t_runtime, file = "rfe_vif_5_t_runtime.rds")

print(rfe_vif_5_t)

predictors(rfe_vif_5_t)

# Saving results
# saveRDS(rfe_vif_5_t, file = "rfe_vif_5_t.rds")
rfe_vif_5_t <- readRDS("rfe_vif_5_t.rds")

# Variables and their importance scores
# varimp_data <- data.frame(feature = row.names(varImp(rfe_vif_5_t)),
#                           importance = varImp(rfe_vif_5_t))

# Evaluation on test set
set.seed(16)
postResample(predict(rfe_vif_5_t, scaled_test_fs[,VIF7$Features]), as.factor(test_fs[,6]))
# Accuracy = 0.503019324
# Kappa = 0.006038647

# Checking other evaluation metrics for the test set
library(pROC)
# Saving predictions in model_pred6 object
model_pred6 <- predict(object = rfe_vif_5_t, newdata = scaled_test_fs[,VIF7$Features], type = "response")
# Saving actual classes in a separate column of model_pred6
model_pred6$actual <- as.factor(scaled_test_fs$fire_points)
# Copying pred and actual in a separate columns
model_pred6$pred2 <- model_pred6$pred
model_pred6$actual2 <- model_pred6$actual
# Renaming the doubled columns
levels(model_pred6$actual2) <- c("non-fire", "fire")
levels(model_pred6$pred2) <- c("non-fire", "fire")
# Saving the real y values and the predicted values in a roc object
colnames(model_pred6) <- c("pred", "neg_pred", "pos_pred", "actual", "pred2", "actual2")
data_roc6 <- roc(as.numeric(model_pred6$actual), as.numeric(model_pred6$pos_pred))
data_roc6$auc # AUC = 0.4398
# Confusion matrix (simple)
table(model_pred6$actual2, model_pred6$pred2)
# Creating confusion matrix (with caret package)
library(caret)
confusionMatrix(data = model_pred6$pred2,
                reference = model_pred6$actual2,
                mode = "everything",
                positive = "fire")
```

---

### Step 13 - Lasso

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
library(glmnet)
library(pROC)

# Finding the optimal value of lambda with 10-fold CV
x <- model.matrix(fire_points ~., scaled_training_fs[,-c(1:5)])
y <- scaled_training_fs$fire_points

set.seed(77)
cv.lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial", type.measure = "auc", nfolds = 10)

# saveRDS(object = cv.lasso, file = "cv.lasso.rds")
cv.lasso <- readRDS(file = "I. DATA PREPARATION & II. FEATURE SELECTION/cv.lasso.rds")

png(filename = "cv.lasso.png", width = 1000, height = 1000, res = 125)
plot(cv.lasso)
dev.off()

# Lambda min
lambda_min <- cv.lasso$lambda.min
lambda_min # 0.0003007647

# Lambda 1se
lambda_1se <- cv.lasso$lambda.1se
lambda_1se # 0.01496913

# Viewing lambda values that holds AUC values respectively
View(cbind(cv.lasso$lambda, cv.lasso$cvm))

coef(cv.lasso, cv.lasso$lambda.min) # -2 variables = 25
coef(cv.lasso, cv.lasso$lambda.1se) # -13 variables = 14

#---

# Training the model with lambda min
lasso_mod1 <- glmnet(x, y, alpha = 1, family = "binomial", lambda = lambda_min)
coef(lasso_mod1)
# saveRDS(object = lasso_mod1, file = "lasso_mod1.rds")
lasso_mod1 <- readRDS(file = "lasso_mod1.rds")

# Checking other evaluation metrics for the test set
library(pROC)
# Saving predictions in model_pred7 object
model_pred7 <- as.data.frame(predict(object = lasso_mod1, newx = model.matrix(fire_points ~., scaled_test_fs[,-c(1:5)]), type = "class"))
# Saving actual classes in a separate column of model_pred7
model_pred7$actual <- as.factor(scaled_test_fs$fire_points)
# Copying pred and actual in a separate columns
model_pred7$pred2 <- as.factor(model_pred7$s0)
model_pred7$actual2 <- model_pred7$actual
# Renaming levels for the doubled columns
levels(model_pred7$actual2) <- c("non-fire", "fire")
levels(model_pred7$pred2) <- c("non-fire", "fire")
# Confusion matrix (simple)
table(model_pred7$actual2, model_pred7$pred2)
# Creating confusion matrix (with caret package)
library(caret)
confusionMatrix(data = model_pred7$pred2,
                reference = model_pred7$actual2,
                mode = "everything",
                positive = "fire")
# Saving the real y values and the predicted values in a roc object
model_pred7b <- as.data.frame(predict(object = lasso_mod1, newx = model.matrix(fire_points ~., scaled_test_fs[,-c(1:5)]), type = "response"))
model_pred7b$actual <- scaled_test_fs$fire_points
data_roc7 <- roc(model_pred7b$actual, model_pred7b$s0)
data_roc7$auc # AUC = 0.859

#---

# Training the model with lambda 1se
lasso_mod2 <- glmnet(x, y, alpha = 1, family = "binomial", lambda = lambda_1se)
coef(lasso_mod2)
# saveRDS(object = lasso_mod2, file = "lasso_mod2.rds")
lasso_mod2 <- readRDS(file = "lasso_mod2.rds")

# Checking other evaluation metrics for the test set
library(pROC)
# Saving predictions in model_pred8 object
model_pred8 <- as.data.frame(predict(object = lasso_mod2, newx = model.matrix(fire_points ~., scaled_test_fs[,-c(1:5)]), type = "class"))
# Saving actual classes in a separate column of model_pred8
model_pred8$actual <- as.factor(scaled_test_fs$fire_points)
# Copying pred and actual in a separate columns
model_pred8$pred2 <- as.factor(model_pred8$s0)
model_pred8$actual2 <- model_pred8$actual
# Renaming levels for the doubled columns
levels(model_pred8$actual2) <- c("non-fire", "fire")
levels(model_pred8$pred2) <- c("non-fire", "fire")
# Confusion matrix (simple)
table(model_pred8$actual2, model_pred8$pred2)
# Creating confusion matrix (with caret package)
library(caret)
confusionMatrix(data = model_pred8$pred2,
                reference = model_pred8$actual2,
                mode = "everything",
                positive = "fire")
# Saving the real y values and the predicted values in a roc object
model_pred8b <- as.data.frame(predict(object = lasso_mod2, newx = model.matrix(fire_points ~., scaled_test_fs[,-c(1:5)]), type = "response"))
model_pred8b$actual <- scaled_test_fs$fire_points
data_roc8 <- roc(model_pred8b$actual, model_pred8b$s0)
data_roc8$auc # AUC = 0.8551
```

---

### Step 14 - Saving the datasets separately for H2O

```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
# Permissive VIF set (pVIF), n = 23
first_six_columns <- names(scaled_training[1:6])
columns_to_select <- unique(c(first_six_columns, VIF4$Features))
permissive_VIF_set <- scaled_training[,columns_to_select]
# saveRDS(object = permissive_VIF_set, file = "permissive_VIF_set.rds")

# ---

# Restrictive VIF set (rVIF), n = 20
columns_to_select <- unique(c(first_six_columns, VIF7$Features))
restrictive_VIF_set <- scaled_training[,columns_to_select]
# saveRDS(object = restrictive_VIF_set, file = "restrictive_VIF_set.rds")

# ---

# “All in” rerank F (RFE-all-in-F), n = 10
columns_to_select <- unique(c(first_six_columns, rfe_all_f_1$optVariables))
RFE_all_f <- scaled_training[,columns_to_select]
# saveRDS(object = RFE_all_f, file = "RFE_all_f_model.rds")

# ---

# “All in” rerank T (RFE-all-in-T), n = 9
columns_to_select <- unique(c(first_six_columns, rfe_all_t_1$optVariables))
RFE_all_t <- scaled_training[,columns_to_select]
# saveRDS(object = RFE_all_t, file = "RFE_all_f_ds.rds")

# ---

# Permissive VIF set (n = 23) rerank F (RFE-pVIF-F), n = 19
columns_to_select <- unique(c(first_six_columns, rfe_vif_f$optVariables))
RFE_pvif_f <- scaled_training[,columns_to_select]
# saveRDS(object = RFE_pvif_f, file = "RFE_pvif_f.rds")

# ---

# Permissive VIF set (n = 23) rerank T (RFE-pVIF-T), n = 21
columns_to_select <- unique(c(first_six_columns, rfe_vif_t$optVariables))
RFE_pvif_t <- scaled_training[,columns_to_select]
# saveRDS(object = RFE_pvif_t, file = "RFE_pvif_t.rds")

# ---

# Restrictive VIF set (n = 20) rerank F (RFE-rVIF-F), n = 17
columns_to_select <- unique(c(first_six_columns, rfe_vif_5_f$optVariables))
RFE_rvif_f <- scaled_training[,columns_to_select]
# saveRDS(object = RFE_rvif_f, file = "RFE_rvif_f.rds")

# ---

# Restrictive VIF set(n = 20) rerank T (RFE-rVIF-T), n = 1
columns_to_select <- unique(c(first_six_columns, rfe_vif_5_t$optVariables))
RFE_rvif_t <- scaled_training[,columns_to_select]
# saveRDS(object = RFE_rvif_t, file = "RFE_rvif_t.rds")

# ---

# lamda.min, n = 25
variables <- coef(lasso_mod1)
variables <- variables@Dimnames[[1]]
variables <- as.data.frame(variables)
variables <- variables[-c(1:2,18,25),] # excluding intercept, railway, forest_cover2

columns_to_select <- unique(c(first_six_columns, variables))
LASSO_min <- scaled_training[,columns_to_select]
# saveRDS(object = LASSO_min, file = "LASSO_min.rds")

# ---

# lamda.1se, n = 14
variables <- coef(lasso_mod2)
variables <- variables@Dimnames[[1]]
variables <- as.data.frame(variables)
variables <- variables[c(3,5,7,10:13,16,19:20,22,24,26:27),] # including the 14 variables chosen

columns_to_select <- unique(c(first_six_columns, variables))
LASSO_1se <- scaled_training[,columns_to_select]
# saveRDS(object = LASSO_1se, file = "LASSO_1se.rds")
```




